# Extension: scrape "flash-deal monitor" data from the manmanbuy site
# http://tool2.manmanbuy.com/PriceLessSort.aspx
# 1. Fetch the JD price-drop listings with their redirect links (or grab the deal info directly -> step 3)
# 2. Extract the outbound product URL behind each listing
# 3. Normalise the URL, producing clean product name/price records plus a direct link

import requests
import re
import pandas as pd

# Global accumulator: parse_page() appends one dict per scraped deal here,
# so results from successive pages pile up across calls.
items = []


def parse_page(url):
    """Fetch one "price-drop monitor" listing page and extract its deals.

    Each deal becomes a dict appended to the module-level ``items`` list
    (results accumulate across calls); the cumulative list is returned.
    The raw HTML is also dumped to ``manmanbuy.html`` for debugging —
    note that file is overwritten on every call.

    :param url: full listing-page URL (see ``main`` for how it is built).
    :return: the module-level ``items`` list, including this page's deals.
    :raises requests.RequestException: on network failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }
    # Fix: a timeout keeps one stalled server response from hanging the
    # whole multi-page crawl forever.
    resp = requests.get(url=url, headers=headers, timeout=10)
    text = resp.text
    with open('manmanbuy.html', 'w', encoding='utf-8') as f:
        f.write(text)
    # Each deal lives inside r'<li class="proitem">(.*?)</li>'; the patterns
    # below extract one parallel list per field, stitched together by zip().
    # Time the price drop was spotted
    item_times = re.findall(r'<li class="proitem">.*?<span>(.*?)</span>', text, re.S)
    # Product thumbnail image
    item_imgs = re.findall(r'<li class="proitem">.*?src="(.*?)".*?</div>', text, re.S)
    # Product name
    item_names = re.findall(r'<li class="proitem">.*?title=.*?>(.*?)</a>', text, re.S)
    # Store / mall name
    item_stores = re.findall(r'class=\'proinfo\'.*?alt=\'(.*?)\'', text, re.S)
    # Direct JD product link (the url= query parameter of the redirect link)
    item_urls = re.findall(r'<li class="proitem">.*?href=".*?url=(.*?)".*?</div>', text, re.S)
    # Original price (CSS class t7)
    item_original_prices = re.findall(r'class="t7">(.*?)</div>', text, re.S)
    # Amount saved
    item_cuts = re.findall(r'<li class="proitem">.*?title="(.*?)"', text, re.S)
    # Current price (CSS class t3)
    item__now_prices = re.findall(r'class="t3">(.*?)</div>', text, re.S)
    # Discount percentage
    item_discounts = re.findall(r'<li class="proitem">.*?title=".*?">(.*?)</a>', text, re.S)
    # Historical lowest price (CSS class t5)
    item_lowests = re.findall(r'class="t5".*?<span style=.*?>(.*?)</span>', text, re.S)
    # NOTE(review): zip() silently truncates to the shortest list, so a single
    # regex miss on one <li> shifts every later record out of alignment — the
    # length printout at the bottom exists to detect exactly that.
    for value in zip(item_times, item_stores, item_names, item_imgs, item_urls,
                     item_original_prices, item_cuts, item__now_prices,
                     item_discounts, item_lowests):
        (item_time, item_store, item_name, item_img, item_url,
         item_original_price, item_cut, item__now_price,
         item_discount, item_lowest) = value
        # One record per deal, appended to the global accumulator.
        item = {'item_time': item_time,
                'item_store': item_store,
                'item_name': item_name,
                'item_img': item_img,
                'item_url': item_url,
                'item_original_price': item_original_price,
                'item_cut': item_cut,
                'item__now_price': item__now_price,
                'item_discount': item_discount,
                'item_lowest': item_lowest,
                }
        items.append(item)
    # Debug output: stores found on this page plus everything collected so far.
    print(item_stores)
    print(len(item_stores))
    print(items)
    # Sanity check: all ten field lists should have equal lengths.
    field_lists = [item_times, item_stores, item_names, item_imgs, item_urls,
                   item_original_prices, item_cuts, item__now_prices,
                   item_discounts, item_lowests]
    for field in field_lists:
        print(str(len(field)) + ',', end='')
    return items


def write_to_excel(items):
    """Persist the collected deal records to ``items.xlsx`` (no index column)."""
    pd.DataFrame(items).to_excel('items.xlsx', index=False)


def main(n):
    """Crawl *n* listing pages of JD price drops, then save them all to Excel.

    :param n: number of result pages to fetch (PageID 0 through n-1).
    """
    for i in range(n):
        # The q parameter restricts results to JD; its value was captured
        # from the site's own request headers.
        url_jd = 'http://tool2.manmanbuy.com/PriceLessSort.aspx?PageID={0}&q=1_0_0_1_0_8_6_2__0_0__s0_0'.format(i)
        print(url_jd)
        parse_page(url_jd)
    # Fix: records were scraped into the global ``items`` but never saved —
    # write the accumulated list once, after all pages have been fetched.
    write_to_excel(items)


if __name__ == '__main__':
    # Entry point: argument is the number of listing pages to crawl.
    main(100)